In [1]:
import pandas as pd
import numpy as np
import os
import datetime
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn import tree
from sklearn import ensemble

import pytz
import itertools
import visualize
import utils
import pydotplus
import xgboost as xgb

from sklearn import metrics
from sklearn import model_selection

import pvlib
import cs_detection

import visualize_plotly as visualize
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

from IPython.display import Image

%load_ext autoreload
%autoreload 2

np.set_printoptions(precision=4)
%matplotlib notebook

Ground predictions

PVLib Clearsky

Only making ground predictions using the PVLib clearsky model and the statistical model; the NSRDB model won't be available for ground measurements.

In [2]:
# Load the Albuquerque NSRDB dataset and convert timestamps to local (MST) time.
nsrdb = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz')
nsrdb.df.index = nsrdb.df.index.tz_convert('MST')
# Derive a 'tfn' (time-from-solar-noon) feature from the PVLib clearsky curve.
nsrdb.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')

Train/test on NSRDB data to find optimal parameters

Default classifier

In [3]:
# Temporal split: everything before 2015 trains the model, 2015 onward tests it.
train = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None)
test.trim_dates('01-01-2015', None)
In [4]:
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
In [5]:
clf = ensemble.RandomForestClassifier(n_jobs=-1)
In [6]:
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
In [7]:
train.df.keys()
Out[7]:
Index(['GHI', 'Clearsky GHI', 'Cloud Type', 'sky_status', 'Clearsky GHI pvlib',
       'Clearsky GHI stat', 'Clearsky GHI stat smooth', 'ghi_status', 'scale',
       'tfn', 'GHI gradient', 'GHI gradient second',
       'Clearsky GHI pvlib gradient', 'Clearsky GHI pvlib gradient second',
       'abs_ideal_ratio_diff', 'abs_ideal_ratio_diff mean',
       'abs_ideal_ratio_diff std', 'abs_ideal_ratio_diff max',
       'abs_ideal_ratio_diff min', 'abs_ideal_ratio_diff grad',
       'abs_ideal_ratio_diff grad mean', 'abs_ideal_ratio_diff grad std',
       'abs_ideal_ratio_diff grad max', 'abs_ideal_ratio_diff grad min',
       'abs_ideal_ratio_diff grad second',
       'abs_ideal_ratio_diff grad second mean',
       'abs_ideal_ratio_diff grad second std',
       'abs_ideal_ratio_diff grad second max',
       'abs_ideal_ratio_diff grad second min', 'GHI line length',
       'Clearsky GHI pvlib line length',
       'GHI Clearsky GHI pvlib line length ratio',
       'GHI Clearsky GHI pvlib ratio', 'GHI Clearsky GHI pvlib ratio mean',
       'GHI Clearsky GHI pvlib ratio std', 'GHI Clearsky GHI pvlib ratio max',
       'GHI Clearsky GHI pvlib ratio min', 'GHI Clearsky GHI pvlib diff',
       'GHI Clearsky GHI pvlib diff mean', 'GHI Clearsky GHI pvlib diff std',
       'GHI Clearsky GHI pvlib diff max', 'GHI Clearsky GHI pvlib diff min',
       'GHI Clearsky GHI pvlib abs_diff',
       'GHI Clearsky GHI pvlib abs_diff mean',
       'GHI Clearsky GHI pvlib abs_diff std',
       'GHI Clearsky GHI pvlib abs_diff max',
       'GHI Clearsky GHI pvlib abs_diff min'],
      dtype='object')
In [8]:
# Predictor columns: time-from-noon plus rolling-window statistics of the
# measured-vs-modeled irradiance ratio/difference (produced above by
# utils.calc_all_window_metrics with a 3-sample window).
feature_cols = [
    'tfn',
    #  'ghi_status',
    'abs_ideal_ratio_diff',
    'abs_ideal_ratio_diff mean',
    'abs_ideal_ratio_diff std',
    'abs_ideal_ratio_diff grad',
    'abs_ideal_ratio_diff grad mean', 
    'abs_ideal_ratio_diff grad std',
    'abs_ideal_ratio_diff grad second',
    'abs_ideal_ratio_diff grad second mean',
    'abs_ideal_ratio_diff grad second std',
    'GHI Clearsky GHI pvlib line length ratio',
    'GHI Clearsky GHI pvlib abs_diff',
    'GHI Clearsky GHI pvlib abs_diff mean',
    'GHI Clearsky GHI pvlib abs_diff std'
]

# Binary target: the NSRDB-provided clear-sky flag.
target_cols = ['sky_status']
In [9]:
# Sanity check before fitting: report whether any feature column contains NaNs.
for col in feature_cols:
    has_missing = train.df[col].isnull().values.any()
    print(col, has_missing)
tfn False
abs_ideal_ratio_diff False
abs_ideal_ratio_diff mean False
abs_ideal_ratio_diff std False
abs_ideal_ratio_diff grad False
abs_ideal_ratio_diff grad mean False
abs_ideal_ratio_diff grad std False
abs_ideal_ratio_diff grad second False
abs_ideal_ratio_diff grad second mean False
abs_ideal_ratio_diff grad second std False
GHI Clearsky GHI pvlib line length ratio False
GHI Clearsky GHI pvlib abs_diff False
GHI Clearsky GHI pvlib abs_diff mean False
GHI Clearsky GHI pvlib abs_diff std False
In [10]:
vis = visualize.Visualizer()
vis.plot_corr_matrix(train.df[feature_cols].corr(), feature_cols)
/Users/benellis/miniconda3/lib/python3.5/site-packages/seaborn/palettes.py:727: DeprecationWarning:

object of type <class 'float'> cannot be safely interpreted as an integer.

/Users/benellis/miniconda3/lib/python3.5/site-packages/seaborn/palettes.py:727: DeprecationWarning:

object of type <class 'float'> cannot be safely interpreted as an integer.

In [11]:
clf.fit(train.df[train.df['GHI'] > 0][feature_cols].values, train.df[train.df['GHI'] > 0][target_cols].values.flatten())
Out[11]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
In [12]:
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True).astype(bool)
/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

In [13]:
# Visual comparison of ML predictions against NSRDB labels on the test period.
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
# False positives: ML says clear, NSRDB says cloudy.
vis.add_circle_ser(test.df[(test.df['sky_status'] == 0) & (pred)]['GHI'], 'ML clear only')
# False negatives: NSRDB says clear, ML says cloudy.
vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (~pred)]['GHI'], 'NSRDB clear only')
# Agreement set: both sources flag the point clear. Renamed from
# 'ML+NSRDB clear only' ('only' was misleading) to match the 'Both clear'
# label used in the later ground-data sections.
vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (pred)]['GHI'], 'Both clear')
# Feature traces for eyeballing the classifier inputs alongside the labels.
vis.add_line_ser(test.df['abs_ideal_ratio_diff'])
vis.add_line_ser(test.df['abs_ideal_ratio_diff grad'])
vis.add_line_ser(test.df['abs_ideal_ratio_diff grad mean'])
vis.add_line_ser(test.df['abs_ideal_ratio_diff grad std'])
vis.add_line_ser(test.df['abs_ideal_ratio_diff grad second'])
vis.add_line_ser(test.df['abs_ideal_ratio_diff grad second mean'])
vis.add_line_ser(test.df['abs_ideal_ratio_diff grad second std'])
vis.add_line_ser(test.df['GHI Clearsky GHI pvlib abs_diff'])
vis.add_line_ser(test.df['GHI Clearsky GHI pvlib abs_diff mean'])
vis.add_line_ser(test.df['GHI Clearsky GHI pvlib abs_diff std'])
vis.show()
In [14]:
# Confusion matrix of NSRDB truth vs ML prediction (rows = truth, cols = predicted).
cm = metrics.confusion_matrix(test.df['sky_status'].values, pred)
vis = visualize.Visualizer()
vis.plot_confusion_matrix(cm, labels=['cloudy', 'clear'])
In [15]:
# Feature importances of the fitted random forest, one bar per feature.
bar = go.Bar(x=feature_cols, y=clf.feature_importances_)
iplot([bar])

Gridsearch

In [16]:
import warnings
In [17]:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    # Hyperparameter grid for the random forest.
    params = {}
    params['max_depth'] = [4, 8, 12, 16]
    params['n_estimators'] = [64]
    params['class_weight'] = [None, 'balanced']
    params['min_samples_leaf'] = [1, 2, 3]

    # The training split and its window metrics do not depend on the
    # hyperparameters, so build them ONCE instead of recomputing the
    # expensive calc_all_window_metrics call on every grid point
    # (the original loop redid this 24 times with identical results).
    train2 = cs_detection.ClearskyDetection(train.df)
    train2.trim_dates('01-01-1999', '01-01-2014')
    utils.calc_all_window_metrics(train2.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
    X_train = train2.df[train2.df['GHI'] > 0][feature_cols].values
    y_train = train2.df[train2.df['GHI'] > 0][target_cols].values.flatten()

    results = []
    for depth, nest, cw, min_samples in itertools.product(params['max_depth'], params['n_estimators'], params['class_weight'], params['min_samples_leaf']):
        print('Params:')
        print('depth: {}, n_estimators: {}, class_weight: {}, min_samples_leaf: {}'.format(depth, nest, cw, min_samples))
        # Rebuild the 2014 validation object each iteration, as the original
        # did -- iter_predict_daily may alter the underlying frame
        # (TODO confirm), so reusing one object could leak state between runs.
        test2 = cs_detection.ClearskyDetection(train.df)
        test2.trim_dates('01-01-2014', '01-01-2015')
        clf = ensemble.RandomForestClassifier(max_depth=depth, n_estimators=nest, class_weight=cw, min_samples_leaf=min_samples, n_jobs=-1)
        clf.fit(X_train, y_train)

        print('\t Scores:')
        test_pred = test2.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True)
        accuracy_score = metrics.accuracy_score(test2.df['sky_status'], test_pred)
        print('\t\t accuracy: {}'.format(accuracy_score))
        f1_score = metrics.f1_score(test2.df['sky_status'], test_pred)
        print('\t\t f1:{}'.format(f1_score))
        recall_score = metrics.recall_score(test2.df['sky_status'], test_pred)
        print('\t\t recall:{}'.format(recall_score))
        precision_score = metrics.precision_score(test2.df['sky_status'], test_pred)
        print('\t\t precision:{}'.format(precision_score))
        results.append({'max_depth': depth, 'n_estimators': nest, 'class_weight': cw, 'min_samples_leaf': min_samples,
                        'accuracy': accuracy_score, 'f1': f1_score, 'recall': recall_score, 'precision': precision_score})
Params:
depth: 4, n_estimators: 64, class_weight: None, min_samples_leaf: 1
	 Scores:
		 accuracy: 0.9482305936073059
		 f1:0.9078532967591182
		 recall:0.9294778448096526
		 precision:0.8872120730738682
Params:
depth: 4, n_estimators: 64, class_weight: None, min_samples_leaf: 2
	 Scores:
		 accuracy: 0.9482305936073059
		 f1:0.9078345696575552
		 recall:0.9292698148533389
		 precision:0.8873659117997617
Params:
depth: 4, n_estimators: 64, class_weight: None, min_samples_leaf: 3
	 Scores:
		 accuracy: 0.9478881278538813
		 f1:0.9070548712206047
		 recall:0.9267734553775744
		 precision:0.8881578947368421
Params:
depth: 4, n_estimators: 64, class_weight: balanced, min_samples_leaf: 1
	 Scores:
		 accuracy: 0.9470890410958904
		 f1:0.9052050311892832
		 recall:0.9207405866444768
		 precision:0.8901850362027354
Params:
depth: 4, n_estimators: 64, class_weight: balanced, min_samples_leaf: 2
	 Scores:
		 accuracy: 0.947945205479452
		 f1:0.9070336391437309
		 recall:0.9255252756396921
		 precision:0.8892664401359185
Params:
depth: 4, n_estimators: 64, class_weight: balanced, min_samples_leaf: 3
	 Scores:
		 accuracy: 0.9474885844748858
		 f1:0.9061224489795918
		 recall:0.9236530060328687
		 precision:0.8892449429200882
Params:
depth: 8, n_estimators: 64, class_weight: None, min_samples_leaf: 1
	 Scores:
		 accuracy: 0.9519977168949771
		 f1:0.9151790216843168
		 recall:0.9438319117952986
		 precision:0.8882145653876272
Params:
depth: 8, n_estimators: 64, class_weight: None, min_samples_leaf: 2
	 Scores:
		 accuracy: 0.9525684931506849
		 f1:0.9161706849591446
		 recall:0.9446640316205533
		 precision:0.8893458676067372
Params:
depth: 8, n_estimators: 64, class_weight: None, min_samples_leaf: 3
	 Scores:
		 accuracy: 0.9516552511415525
		 f1:0.914418510659796
		 recall:0.941335552319534
		 precision:0.8889980353634578
Params:
depth: 8, n_estimators: 64, class_weight: balanced, min_samples_leaf: 1
	 Scores:
		 accuracy: 0.951769406392694
		 f1:0.9144303797468354
		 recall:0.9392552527563969
		 precision:0.8908839779005525
Params:
depth: 8, n_estimators: 64, class_weight: balanced, min_samples_leaf: 2
	 Scores:
		 accuracy: 0.9515981735159817
		 f1:0.9140308191403083
		 recall:0.937799043062201
		 precision:0.8914376112319558
Params:
depth: 8, n_estimators: 64, class_weight: balanced, min_samples_leaf: 3
	 Scores:
		 accuracy: 0.9514269406392694
		 f1:0.9138053276612985
		 recall:0.9384231329311421
		 precision:0.8904461113304382
Params:
depth: 12, n_estimators: 64, class_weight: None, min_samples_leaf: 1
	 Scores:
		 accuracy: 0.9526255707762558
		 f1:0.9168003207698476
		 recall:0.9513209902225921
		 precision:0.8846972335074482
Params:
depth: 12, n_estimators: 64, class_weight: None, min_samples_leaf: 2
	 Scores:
		 accuracy: 0.9522831050228311
		 f1:0.9161147902869757
		 recall:0.9496567505720824
		 precision:0.8848614072494669
Params:
depth: 12, n_estimators: 64, class_weight: None, min_samples_leaf: 3
	 Scores:
		 accuracy: 0.9523401826484018
		 f1:0.9161225514816673
		 recall:0.9486166007905138
		 precision:0.8857808857808858
Params:
depth: 12, n_estimators: 64, class_weight: balanced, min_samples_leaf: 1
	 Scores:
		 accuracy: 0.9524543378995434
		 f1:0.9160366898498135
		 recall:0.9452881214894945
		 precision:0.8885412592882284
Params:
depth: 12, n_estimators: 64, class_weight: balanced, min_samples_leaf: 2
	 Scores:
		 accuracy: 0.9521118721461187
		 f1:0.9153636638757188
		 recall:0.9438319117952986
		 precision:0.8885624755189973
Params:
depth: 12, n_estimators: 64, class_weight: balanced, min_samples_leaf: 3
	 Scores:
		 accuracy: 0.9519406392694064
		 f1:0.9151893634165994
		 recall:0.9450800915331807
		 precision:0.8871314196446006
Params:
depth: 16, n_estimators: 64, class_weight: None, min_samples_leaf: 1
	 Scores:
		 accuracy: 0.9515981735159817
		 f1:0.914927768860353
		 recall:0.9486166007905138
		 precision:0.8835496996706065
Params:
depth: 16, n_estimators: 64, class_weight: None, min_samples_leaf: 2
	 Scores:
		 accuracy: 0.9508561643835617
		 f1:0.9134412385643913
		 recall:0.9450800915331807
		 precision:0.883852140077821
Params:
depth: 16, n_estimators: 64, class_weight: None, min_samples_leaf: 3
	 Scores:
		 accuracy: 0.9507420091324201
		 f1:0.9131703390683167
		 recall:0.9440399417516122
		 precision:0.8842556508183944
Params:
depth: 16, n_estimators: 64, class_weight: balanced, min_samples_leaf: 1
	 Scores:
		 accuracy: 0.9509132420091324
		 f1:0.9134808853118712
		 recall:0.9444560016642396
		 precision:0.8844730177284239
Params:
depth: 16, n_estimators: 64, class_weight: balanced, min_samples_leaf: 2
	 Scores:
		 accuracy: 0.9516552511415525
		 f1:0.9146771431449582
		 recall:0.9444560016642396
		 precision:0.88671875
Params:
depth: 16, n_estimators: 64, class_weight: balanced, min_samples_leaf: 3
	 Scores:
		 accuracy: 0.9507420091324201
		 f1:0.9129777150347888
		 recall:0.9417516122321614
		 precision:0.8859099804305284
In [18]:
runs_df = pd.DataFrame(results)
In [20]:
runs_df.sort_values('accuracy', ascending=False)
Out[20]:
accuracy class_weight f1 max_depth min_samples_leaf n_estimators precision recall
12 0.952626 None 0.916800 12 1 64 0.884697 0.951321
7 0.952568 None 0.916171 8 2 64 0.889346 0.944664
15 0.952454 balanced 0.916037 12 1 64 0.888541 0.945288
14 0.952340 None 0.916123 12 3 64 0.885781 0.948617
13 0.952283 None 0.916115 12 2 64 0.884861 0.949657
16 0.952112 balanced 0.915364 12 2 64 0.888562 0.943832
6 0.951998 None 0.915179 8 1 64 0.888215 0.943832
17 0.951941 balanced 0.915189 12 3 64 0.887131 0.945080
9 0.951769 balanced 0.914430 8 1 64 0.890884 0.939255
22 0.951655 balanced 0.914677 16 2 64 0.886719 0.944456
8 0.951655 None 0.914419 8 3 64 0.888998 0.941336
10 0.951598 balanced 0.914031 8 2 64 0.891438 0.937799
18 0.951598 None 0.914928 16 1 64 0.883550 0.948617
11 0.951427 balanced 0.913805 8 3 64 0.890446 0.938423
21 0.950913 balanced 0.913481 16 1 64 0.884473 0.944456
19 0.950856 None 0.913441 16 2 64 0.883852 0.945080
20 0.950742 None 0.913170 16 3 64 0.884256 0.944040
23 0.950742 balanced 0.912978 16 3 64 0.885910 0.941752
1 0.948231 None 0.907835 4 2 64 0.887366 0.929270
0 0.948231 None 0.907853 4 1 64 0.887212 0.929478
4 0.947945 balanced 0.907034 4 2 64 0.889266 0.925525
2 0.947888 None 0.907055 4 3 64 0.888158 0.926773
5 0.947489 balanced 0.906122 4 3 64 0.889245 0.923653
3 0.947089 balanced 0.905205 4 1 64 0.890185 0.920741
In [21]:
runs_df.sort_values('f1', ascending=False)
Out[21]:
accuracy class_weight f1 max_depth min_samples_leaf n_estimators precision recall
12 0.952626 None 0.916800 12 1 64 0.884697 0.951321
7 0.952568 None 0.916171 8 2 64 0.889346 0.944664
14 0.952340 None 0.916123 12 3 64 0.885781 0.948617
13 0.952283 None 0.916115 12 2 64 0.884861 0.949657
15 0.952454 balanced 0.916037 12 1 64 0.888541 0.945288
16 0.952112 balanced 0.915364 12 2 64 0.888562 0.943832
17 0.951941 balanced 0.915189 12 3 64 0.887131 0.945080
6 0.951998 None 0.915179 8 1 64 0.888215 0.943832
18 0.951598 None 0.914928 16 1 64 0.883550 0.948617
22 0.951655 balanced 0.914677 16 2 64 0.886719 0.944456
9 0.951769 balanced 0.914430 8 1 64 0.890884 0.939255
8 0.951655 None 0.914419 8 3 64 0.888998 0.941336
10 0.951598 balanced 0.914031 8 2 64 0.891438 0.937799
11 0.951427 balanced 0.913805 8 3 64 0.890446 0.938423
21 0.950913 balanced 0.913481 16 1 64 0.884473 0.944456
19 0.950856 None 0.913441 16 2 64 0.883852 0.945080
20 0.950742 None 0.913170 16 3 64 0.884256 0.944040
23 0.950742 balanced 0.912978 16 3 64 0.885910 0.941752
0 0.948231 None 0.907853 4 1 64 0.887212 0.929478
1 0.948231 None 0.907835 4 2 64 0.887366 0.929270
2 0.947888 None 0.907055 4 3 64 0.888158 0.926773
4 0.947945 balanced 0.907034 4 2 64 0.889266 0.925525
5 0.947489 balanced 0.906122 4 3 64 0.889245 0.923653
3 0.947089 balanced 0.905205 4 1 64 0.890185 0.920741
In [22]:
runs_df.sort_values('precision', ascending=False)
Out[22]:
accuracy class_weight f1 max_depth min_samples_leaf n_estimators precision recall
10 0.951598 balanced 0.914031 8 2 64 0.891438 0.937799
9 0.951769 balanced 0.914430 8 1 64 0.890884 0.939255
11 0.951427 balanced 0.913805 8 3 64 0.890446 0.938423
3 0.947089 balanced 0.905205 4 1 64 0.890185 0.920741
7 0.952568 None 0.916171 8 2 64 0.889346 0.944664
4 0.947945 balanced 0.907034 4 2 64 0.889266 0.925525
5 0.947489 balanced 0.906122 4 3 64 0.889245 0.923653
8 0.951655 None 0.914419 8 3 64 0.888998 0.941336
16 0.952112 balanced 0.915364 12 2 64 0.888562 0.943832
15 0.952454 balanced 0.916037 12 1 64 0.888541 0.945288
6 0.951998 None 0.915179 8 1 64 0.888215 0.943832
2 0.947888 None 0.907055 4 3 64 0.888158 0.926773
1 0.948231 None 0.907835 4 2 64 0.887366 0.929270
0 0.948231 None 0.907853 4 1 64 0.887212 0.929478
17 0.951941 balanced 0.915189 12 3 64 0.887131 0.945080
22 0.951655 balanced 0.914677 16 2 64 0.886719 0.944456
23 0.950742 balanced 0.912978 16 3 64 0.885910 0.941752
14 0.952340 None 0.916123 12 3 64 0.885781 0.948617
13 0.952283 None 0.916115 12 2 64 0.884861 0.949657
12 0.952626 None 0.916800 12 1 64 0.884697 0.951321
21 0.950913 balanced 0.913481 16 1 64 0.884473 0.944456
20 0.950742 None 0.913170 16 3 64 0.884256 0.944040
19 0.950856 None 0.913441 16 2 64 0.883852 0.945080
18 0.951598 None 0.914928 16 1 64 0.883550 0.948617
In [23]:
runs_df.sort_values('recall', ascending=False)
Out[23]:
accuracy class_weight f1 max_depth min_samples_leaf n_estimators precision recall
12 0.952626 None 0.916800 12 1 64 0.884697 0.951321
13 0.952283 None 0.916115 12 2 64 0.884861 0.949657
18 0.951598 None 0.914928 16 1 64 0.883550 0.948617
14 0.952340 None 0.916123 12 3 64 0.885781 0.948617
15 0.952454 balanced 0.916037 12 1 64 0.888541 0.945288
19 0.950856 None 0.913441 16 2 64 0.883852 0.945080
17 0.951941 balanced 0.915189 12 3 64 0.887131 0.945080
7 0.952568 None 0.916171 8 2 64 0.889346 0.944664
22 0.951655 balanced 0.914677 16 2 64 0.886719 0.944456
21 0.950913 balanced 0.913481 16 1 64 0.884473 0.944456
20 0.950742 None 0.913170 16 3 64 0.884256 0.944040
6 0.951998 None 0.915179 8 1 64 0.888215 0.943832
16 0.952112 balanced 0.915364 12 2 64 0.888562 0.943832
23 0.950742 balanced 0.912978 16 3 64 0.885910 0.941752
8 0.951655 None 0.914419 8 3 64 0.888998 0.941336
9 0.951769 balanced 0.914430 8 1 64 0.890884 0.939255
11 0.951427 balanced 0.913805 8 3 64 0.890446 0.938423
10 0.951598 balanced 0.914031 8 2 64 0.891438 0.937799
0 0.948231 None 0.907853 4 1 64 0.887212 0.929478
1 0.948231 None 0.907835 4 2 64 0.887366 0.929270
2 0.947888 None 0.907055 4 3 64 0.888158 0.926773
4 0.947945 balanced 0.907034 4 2 64 0.889266 0.925525
5 0.947489 balanced 0.906122 4 3 64 0.889245 0.923653
3 0.947089 balanced 0.905205 4 1 64 0.890185 0.920741
In [24]:
runs_df.to_csv('7_abq_ml_exploration_time_freq_rf_time_from_noon_non_directional_diff_ratio_gridsearch.csv')

Best precision model

In [25]:
# Best-precision hyperparameters from the gridsearch above
# (run index 10: precision ~0.8914).
best_params = {'max_depth': 8, 'n_estimators': 64, 'class_weight': 'balanced', 'min_samples_leaf': 2}
In [26]:
# Rebuild the pre-2015 train / 2015+ test split on the NSRDB data,
# then scale the measured GHI against the PVLib clearsky model.
train = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
In [27]:
train.df[train.df['Clearsky GHI pvlib'] > 0]['sky_status'].value_counts()
Out[27]:
True     77904
False    72425
Name: sky_status, dtype: int64
In [28]:
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
In [29]:
# Fit a random forest with the best-precision parameters on daylight rows only.
clf = ensemble.RandomForestClassifier(**best_params, n_jobs=-1)
clf.fit(train.df[train.df['GHI'] > 0][feature_cols].values, train.df[train.df['GHI'] > 0][target_cols].values.flatten())
Out[29]:
RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=8, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=64, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
In [30]:
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
In [31]:
%%time
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True).astype(bool)
/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

CPU times: user 3.34 s, sys: 1.19 s, total: 4.53 s
Wall time: 1min
In [32]:
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 0) & (pred)]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (~pred)]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (pred)]['GHI'], 'ML+NSRDB clear only')
vis.add_line_ser(test.df['abs_ideal_ratio_diff'] * 100)
vis.add_line_ser(test.df['GHI Clearsky GHI pvlib abs_diff'])
vis.show()
In [33]:
cm = metrics.confusion_matrix(test.df['sky_status'].values, pred)
vis = visualize.Visualizer()
vis.plot_confusion_matrix(cm, labels=['cloudy', 'clear'])
In [34]:
print(metrics.f1_score(test.df['sky_status'].values, pred))
0.879036795319
In [35]:
bar = go.Bar(x=feature_cols, y=clf.feature_importances_)
iplot([bar])

Best accuracy, F1, and recall model

In [36]:
# accuracy, f1, recall
best_params = {'max_depth': 12, 'n_estimators': 64, 'class_weight': None, 'min_samples_leaf': 1}
In [37]:
train = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
In [38]:
train.df[train.df['Clearsky GHI pvlib'] > 0]['sky_status'].value_counts()
Out[38]:
True     77904
False    72425
Name: sky_status, dtype: int64
In [39]:
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
In [40]:
clf = ensemble.RandomForestClassifier(**best_params, n_jobs=-1)
clf.fit(train.df[train.df['GHI'] > 0][feature_cols].values, train.df[train.df['GHI'] > 0][target_cols].values.flatten())
Out[40]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=12, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=64, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
In [41]:
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)
In [42]:
%%time
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=True, by_day=True).astype(bool)
/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

/Users/benellis/duramat/clearsky_detection/utils.py:331: RuntimeWarning:

Large scaling value.  Day will not be further assessed or scaled.

/Users/benellis/duramat/clearsky_detection/utils.py:338: RuntimeWarning:

Scaling did not converge.

CPU times: user 6.38 s, sys: 5.27 s, total: 11.6 s
Wall time: 1min
In [43]:
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 0) & (pred)]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (~pred)]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[(test.df['sky_status'] == 1) & (pred)]['GHI'], 'ML+NSRDB clear only')
vis.add_line_ser(test.df['abs_ideal_ratio_diff'] * 100)
vis.add_line_ser(test.df['GHI Clearsky GHI pvlib abs_diff'])
vis.show()
In [44]:
cm = metrics.confusion_matrix(test.df['sky_status'].values, pred)
vis = visualize.Visualizer()
vis.plot_confusion_matrix(cm, labels=['cloudy', 'clear'])
In [45]:
print(metrics.f1_score(test.df['sky_status'].values, pred))
0.884280936455
In [46]:
bar = go.Bar(x=feature_cols, y=clf.feature_importances_)
iplot([bar])

Train on all NSRDB data, test various freq of ground data

In [ ]:
train = cs_detection.ClearskyDetection(nsrdb.df)
train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
clf.fit(train.df[feature_cols].values, train.df[target_cols].values.flatten())
In [ ]:
bar = go.Bar(x=feature_cols, y=clf.feature_importances_)
iplot([bar])

30 min freq ground data

In [ ]:
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')
ground.df.index = ground.df.index.tz_convert('MST')
test = cs_detection.ClearskyDetection(ground.df)
In [ ]:
test.trim_dates('10-01-2015', '11-01-2015')
In [ ]:
test.df = test.df[test.df.index.minute % 30 == 0]
In [ ]:
test.df.keys()
In [ ]:
test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
In [ ]:
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, multiproc=False, by_day=False).astype(bool)
In [ ]:
train2 = cs_detection.ClearskyDetection(nsrdb.df)
train2.intersection(test.df.index)
In [ ]:
nsrdb_clear = train2.df['sky_status'].values
ml_clear = pred
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear')
vis.show()
make_subplots(test, train2, 8, 2, 0, width=1000, height=1200)

probas = clf.predict_proba(test.df[feature_cols].values)
test.df['probas'] = 0
test.df['probas'] = probas[:, 1]

trace0 = go.Scatter(x=test.df.index, y=test.df['GHI'], name='GHI')
trace1 = go.Scatter(x=test.df.index, y=test.df['Clearsky GHI pvlib'], name='GHIcs')
trace2 = go.Scatter(x=test.df.index, y=test.df['GHI'], name='prob', mode='markers', marker={'size': 12, 'color': test.df['probas'], 'colorscale': 'Viridis', 'showscale': True}, text='prob: ' + test.df['probas'].astype(str))
iplot([trace0, trace1, trace2])
In [ ]:
# Probability of the "clear" class for each test point, shown on a slider plot.
probas = clf.predict_proba(test.df[feature_cols].values)
# Assign the clear-class column directly; the original `test.df['probas'] = 0`
# was a dead store, immediately overwritten by the line below.
test.df['probas'] = probas[:, 1]
# test.df['probas'] = test.df['probas'].rolling(3, center=True).mean()
visualize.plot_ts_slider_highligther(test.df, prob='probas')

15 min freq ground data

In [ ]:
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')
ground.df.index = ground.df.index.tz_convert('MST')
test = cs_detection.ClearskyDetection(ground.df)
In [ ]:
test.trim_dates('10-01-2015', '10-17-2015')
In [ ]:
test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
In [ ]:
test.df = test.df[test.df.index.minute % 15 == 0]
# test.df = test.df.resample('15T').apply(lambda x: x[len(x) // 2])
In [ ]:
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 5, multiproc=True, by_day=True).astype(bool)
In [ ]:
train2 = cs_detection.ClearskyDetection(train.df)
train2.trim_dates('10-01-2015', '10-17-2015')
train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0], end=train2.df.index[-1], freq='15min'))
train2.df['sky_status'] = train2.df['sky_status'].fillna(False)
In [ ]:
nsrdb_clear = train2.df['sky_status']
ml_clear = test.df['sky_status iter']
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear')
vis.show()
nsrdb_clear = train2.df['sky_status'].astype(bool)
ml_clear = pred
test.df['nsrdb_sky'] = nsrdb_clear
test.df['nsrdb_sky'] = test.df['nsrdb_sky'].replace(np.nan, False)
test.df['ml_sky'] = pred
make_subplots(test, train2, 8, 2, 0, width=1000, height=1200)
In [ ]:
# Clear-class probabilities for the 15-minute ground data.
probas = clf.predict_proba(test.df[feature_cols].values)
# Direct assignment; the original zero-initialization was a dead store
# immediately overwritten by the real values.
test.df['probas'] = probas[:, 1]
visualize.plot_ts_slider_highligther(test.df, prob='probas')

10 min freq ground data

In [ ]:
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')
ground.df.index = ground.df.index.tz_convert('MST')
test = cs_detection.ClearskyDetection(ground.df)
In [ ]:
test.trim_dates('10-01-2015', '10-08-2015')
In [ ]:
test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
test.scale_by_irrad('Clearsky GHI pvlib')
In [ ]:
test.df = test.df[test.df.index.minute % 10 == 0]
In [ ]:
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 7, multiproc=False, by_day=False).astype(bool)
In [ ]:
# NSRDB labels reindexed onto the 10-min grid (NaN -> not clear).
train2 = cs_detection.ClearskyDetection(train.df)
train2.trim_dates('10-01-2015', '10-08-2015')
train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0], end=train2.df.index[-1], freq='10min'))
train2.df['sky_status'] = train2.df['sky_status'].fillna(False)
In [ ]:
# Visual ML vs NSRDB agreement plot, same pattern as the 15-min case.
nsrdb_clear = train2.df['sky_status']
ml_clear = test.df['sky_status iter']
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear')
vis.show()
In [ ]:
# Clear-sky probability (positive class) for the interactive slider plot.
probas = clf.predict_proba(test.df[feature_cols].values)
test.df['probas'] = 0
test.df['probas'] = probas[:, 1]

visualize.plot_ts_slider_highligther(test.df, prob='probas')

5 min freq ground data

In [ ]:
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')
ground.df.index = ground.df.index.tz_convert('MST')
test = cs_detection.ClearskyDetection(ground.df)
In [ ]:
test.trim_dates('10-01-2015', '10-17-2015')
In [ ]:
test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
test.scale_by_irrad('Clearsky GHI pvlib')
In [ ]:
test.df = test.df[test.df.index.minute % 5 == 0]
In [ ]:
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 13, multiproc=True, by_day=True).astype(bool)
In [ ]:
# NSRDB labels reindexed onto the 5-min grid (NaN -> not clear).
train2 = cs_detection.ClearskyDetection(train.df)
train2.trim_dates('10-01-2015', '10-17-2015')
train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0], end=train2.df.index[-1], freq='5min'))
train2.df['sky_status'] = train2.df['sky_status'].fillna(False)
In [ ]:
# Visual ML vs NSRDB agreement plot, same pattern as the coarser frequencies.
nsrdb_clear = train2.df['sky_status']
ml_clear = test.df['sky_status iter']
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear')
vis.show()
# NOTE(review): three scratch cells were fused onto two giant lines in the
# export (invalid Python); reformatted one statement per line.  The first
# sub-cell duplicates the comparison plot above with different labels.
nsrdb_clear = train2.df['sky_status']
ml_clear = test.df['sky_status iter']
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'Method 1')
vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'Method 2')
vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Method 1+2')
vis.show()

from plotly import tools as tls

nsrdb_clear = train2.df['sky_status']
ml_clear = pred
print(len(nsrdb_clear), len(test.df))
test.df['nsrdb_sky'] = nsrdb_clear
test.df['ml_sky'] = pred
# Tableau-style palette for the per-day subplot traces.
colors = {
    'blue': '#1f77b4',
    'orange': '#ff7f0e',
    'green': '#2ca02c',
    'red': '#d62728',
    'purple': '#9467bd',
    'brown': '#8c564b',
    'pink': '#e377c2',
    'gray': '#7f7f7f',
    'yellow': '#bcbd22',
    'teal': '#17becf'
}
ghi_line = {'color': colors['blue']}
ghics_line = {'color': colors['orange']}
ml_only = {'color': colors['green']}
nsrdb_only = {'color': colors['red']}
both = {'color': colors['purple']}
nrow, ncol = 3, 3
fig = tls.make_subplots(rows=nrow, cols=ncol, shared_xaxes=True, shared_yaxes=True, print_grid=True)
# One subplot per day; only the first nrow * ncol days are drawn.
for i, (name, g) in enumerate(test.df.groupby(test.df.index.date)):
    if i == nrow * ncol:
        break
    legend = False
    if i == 0:
        legend = True  # show the legend only on the first subplot
    g = g.between_time('05:00:00', '19:00:00')  # daylight hours only
    g.index = range(len(g))
    trace0 = go.Scatter(x=g.index, y=g['GHI'], line=ghi_line, showlegend=legend, name='GHI')
    trace1 = go.Scatter(x=g.index, y=g['Clearsky GHI pvlib'], line=ghics_line, showlegend=legend, name='GHIcs')
    trace2 = go.Scatter(x=g[g['ml_sky'] & ~g['nsrdb_sky']].index,
                        y=g[g['ml_sky'] & ~g['nsrdb_sky']]['GHI'],
                        mode='markers', marker=ml_only, showlegend=legend, name='Method 1')
    trace3 = go.Scatter(x=g[~g['ml_sky'] & g['nsrdb_sky']].index,
                        y=g[~g['ml_sky'] & g['nsrdb_sky']]['GHI'],
                        mode='markers', marker=nsrdb_only, showlegend=legend, name='Method 2')
    trace4 = go.Scatter(x=g[g['ml_sky'] & g['nsrdb_sky']].index,
                        y=g[g['ml_sky'] & g['nsrdb_sky']]['GHI'],
                        mode='markers', marker=both, showlegend=legend, name='Method 1 & 2')
    row = i % nrow + 1
    col = i // ncol + 1
    traces = [trace0, trace1, trace2, trace3, trace4]
    for t in traces:
        fig.append_trace(t, row, col)
iplot(fig)  # BUG in original: iplot(fig, layout) -- `layout` was never defined

probas = clf.predict_proba(test.df[feature_cols].values)
test.df['probas'] = 0
# BUG in original: assigned the full (n, 2) predict_proba array to a single
# column; keep only the positive-class (clear) probability, as the working
# cells above do.
test.df['probas'] = probas[:, 1]
trace0 = go.Scatter(x=test.df.index, y=test.df['GHI'], name='GHI')
trace1 = go.Scatter(x=test.df.index, y=test.df['Clearsky GHI pvlib'], name='GHIcs')
trace2 = go.Scatter(x=test.df.index, y=test.df['GHI'], name='prob', mode='markers',
                    marker={'size': 10, 'color': test.df['probas'], 'colorscale': 'Viridis', 'showscale': True},
                    text=test.df['probas'])
iplot([trace0, trace1, trace2])
In [ ]:
probas = clf.predict_proba(test.df[feature_cols].values)
test.df['probas'] = 0
test.df['probas'] = probas

visualize.plot_ts_slider_highligther(test.df, prob='probas')

1 min freq ground data

In [ ]:
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')
ground.df.index = ground.df.index.tz_convert('MST')
test = cs_detection.ClearskyDetection(ground.df)
In [ ]:
test.trim_dates('10-01-2015', '10-17-2015')
In [ ]:
test.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
test.scale_by_irrad('Clearsky GHI pvlib')
In [ ]:
test.df = test.df[test.df.index.minute % 1 == 0]
In [ ]:
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 61, multiproc=True, by_day=True).astype(bool)
In [ ]:
# NSRDB labels reindexed onto the 1-min grid (NaN -> not clear).
train2 = cs_detection.ClearskyDetection(train.df)
train2.trim_dates('10-01-2015', '10-17-2015')
train2.df = train2.df.reindex(pd.date_range(start=train2.df.index[0], end=train2.df.index[-1], freq='1min'))
train2.df['sky_status'] = train2.df['sky_status'].fillna(False)
In [ ]:
# Visual ML vs NSRDB agreement plot, same pattern as the coarser frequencies.
nsrdb_clear = train2.df['sky_status']
ml_clear = test.df['sky_status iter']
vis = visualize.Visualizer()
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[ml_clear & ~nsrdb_clear]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[~ml_clear & nsrdb_clear]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[ml_clear & nsrdb_clear]['GHI'], 'Both clear')
vis.show()
# NOTE(review): scratch cell fused onto a single line in the export (invalid
# Python); reformatted one statement per line.
probas = clf.predict_proba(test.df[feature_cols].values)
test.df['probas'] = 0
# BUG in original: assigned the full (n, 2) predict_proba array to a single
# column; keep only the positive-class (clear) probability.
test.df['probas'] = probas[:, 1]
trace0 = go.Scatter(x=test.df.index, y=test.df['GHI'], name='GHI')
trace1 = go.Scatter(x=test.df.index, y=test.df['Clearsky GHI pvlib'], name='GHIcs')
trace2 = go.Scatter(x=test.df.index, y=test.df['GHI'], name='prob', mode='markers',
                    marker={'size': 10, 'color': test.df['probas'], 'colorscale': 'Viridis', 'showscale': True},
                    text=test.df['probas'])
iplot([trace0, trace1, trace2])
In [ ]:
probas = clf.predict_proba(test.df[feature_cols].values)
test.df['probas'] = 0
test.df['probas'] = probas
visualize.plot_ts_slider_highligther(test.df, prob='probas')

Save model

In [ ]:
import pickle
In [ ]:
# Persist the trained random forest so it can be reloaded without retraining.
# NOTE: pickle files are not safe to load from untrusted sources.
with open('abq_trained.pkl', 'wb') as f:
    pickle.dump(clf, f)
In [ ]:
# Confirm the pickle was written alongside the other abq_* artifacts.
!ls abq*

Conclusion

In general, the clear sky identification looks good. At lower frequencies (30 min, 15 min) we see good agreement with NSRDB-labeled points. I suspect this could be further improved by doing a larger hyperparameter search, or even by doing some feature extraction/reduction/addition.

In [ ]: